##### ########################################################## ######
#####                                                            ######
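#####   Input: data/debates.Rdata (speeches) and                 ######
#####          working/dictionaries_sentence.Rdata (sentences)   ######
#####   Output: 1) Complexity scores at the MP-debate level
#####              (working/complexity_speech.Rdata)
#####   Output: 2) Complexity scores at the sentence level for validation
#####              (working/complexity_sentence.Rdata)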
#####
##### ########################################################## ######

# Remove every object that ls() lists in the current environment, so the
# script does not pick up values left over from earlier work.
# NOTE(review): when this file is source()d into a live session this also
# deletes the caller's objects -- consider running the script in a fresh R
# session instead of clearing the workspace here.
rm(list=ls())
# Rescale a numeric vector linearly so its minimum maps to 0 and its maximum
# maps to 1.
#   x      numeric vector.
#   na.rm  if TRUE (default, matching stdize()), missing values are ignored
#          when finding the minimum and maximum; NA elements stay NA in the
#          result. With na.rm = FALSE a single NA makes every element NA.
# Returns a numeric vector the same length as `x`. If all non-missing values
# are equal the range is zero and the result is NaN (0 / 0).
range01 <- function(x, na.rm = TRUE) {
  bounds <- range(x, na.rm = na.rm)
  (x - bounds[1]) / (bounds[2] - bounds[1])
}
# Standardise a numeric vector to z-scores: subtract the mean and divide by
# the standard deviation, both computed with missing values removed.
#   x  numeric vector.
# Returns a numeric vector the same length as `x`; NA elements stay NA.
# `TRUE` is spelled out because the alias `T` is an ordinary variable that
# can be reassigned.
stdize <- function(x) {
  (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
}

# Load libraries

library(quanteda) # CRAN v.3.0.0
library(data.table) # CRAN v.1.13.6
library(quanteda.dictionaries) # v.0.22 
library(quanteda.textstats) # v0.94
library(text2vec) # CRAN v.0.6
library(plyr) # CRAN v.1.8.6

# Fix the RNG seed.
# NOTE(review): no step visible in this script draws random numbers, so the
# seed presumably only keeps this file consistent with the rest of the
# pipeline -- confirm.
set.seed(221186)


# Load data
# load() restores objects under the names they were saved with, so a wrong
# or stale file would otherwise only surface later as a confusing
# "object not found" error. Check up front for what the speech-level loop
# needs: a data.table `debates` with columns `epobject_id`, `yearmon`, `body`.
loaded_objects <- load("data/debates.Rdata")
if (!"debates" %in% loaded_objects) {
  stop("data/debates.Rdata does not contain an object named 'debates'",
       call. = FALSE)
}
# The loop below filters with data.table syntax (debates[yearmon == y]).
if (!is.data.table(debates)) {
  setDT(debates)
}
missing_cols <- setdiff(c("epobject_id", "yearmon", "body"), names(debates))
if (length(missing_cols) > 0) {
  stop("`debates` is missing required column(s): ",
       paste(missing_cols, collapse = ", "), call. = FALSE)
}


# ---- Speech-level complexity ----
# Score every speech in `debates` with the Flesch-Kincaid readability
# measure, processing the rows of one `yearmon` value at a time.
# Rows whose `yearmon` is NA belong to no slice and therefore get no row in
# the output.
periods <- unique(debates$yearmon)
periods <- periods[!is.na(periods)]

# One slot per period, filled in place rather than growing the list.
complexity_year_out_list <- vector("list", length(periods))
i <- 0L
for (y in periods) {

  print(y) # progress indicator
  i <- i + 1L
  debates_tmp <- debates[yearmon == y]
  complexity_year <- debates_tmp[, c("epobject_id", "body")]

  # Calculate speech complexity.
  # Name the texts explicitly so the returned `document` column can be
  # matched back to row positions without relying on quanteda's default
  # document-name prefix.
  speech_texts <- complexity_year$body
  names(speech_texts) <- paste0("text", seq_along(speech_texts))
  readability <- textstat_readability(speech_texts,
                                      measure = "Flesch.Kincaid")

  # Start from a numeric NA column; rows that do not appear in the
  # readability output keep NA.
  complexity_year$complexity <- NA_real_
  scored_rows <- match(readability$document, names(speech_texts))
  complexity_year$complexity[scored_rows] <- readability$Flesch.Kincaid

  complexity_year_out_list[[i]] <-
    complexity_year[, c("epobject_id", "complexity")]

}

# Stack the per-period tables into a single data.table.
complexity_scores <- rbindlist(complexity_year_out_list,
                               use.names = TRUE, fill = TRUE)

# Flip the sign of every speech-level score (each Flesch-Kincaid value is
# multiplied by -1).
# NOTE(review): presumably so that larger values mean simpler language --
# confirm against how the score is used downstream.
complexity_scores[, complexity := complexity * -1]

# Write the speech-level scores to disk
save(complexity_scores, file = "working/complexity_speech.Rdata")



## Calculate sentence level complexity scores
  
# Load data
# load() restores objects under the names they were saved with; fail early
# if the file does not provide the `dictionary_scores` table used below.
loaded_objects <- load("working/dictionaries_sentence.Rdata")
if (!"dictionary_scores" %in% loaded_objects) {
  stop("working/dictionaries_sentence.Rdata does not contain an object ",
       "named 'dictionary_scores'", call. = FALSE)
}

# Set up subsetting vectors
# Assign consecutive rows to chunks of 1000 (rows 1-1000 -> 1,
# rows 1001-2000 -> 2, ...) so readability can be computed chunk by chunk.
# Built from seq_len() so a zero-row table yields an empty vector instead of
# tripping over `1:0`.
n_sentences <- nrow(dictionary_scores)
dictionary_scores$subset_value <- (seq_len(n_sentences) - 1L) %/% 1000L + 1L

# ---- Sentence-level complexity ----
# Score every sentence in `dictionary_scores` with the Flesch-Kincaid
# readability measure, one `subset_value` chunk at a time.
chunk_ids <- unique(dictionary_scores$subset_value)
final_subset <- max(dictionary_scores$subset_value)

# One slot per chunk, filled in place rather than growing the list.
complexity_year_out_list <- vector("list", length(chunk_ids))
i <- 0L
for (y in chunk_ids) {
  # Progress: a dot per chunk and a percentage every tenth chunk.
  cat(".")
  if (y %% 10 == 0) {
    print(paste0(round(100 * y / final_subset, 1), "% complete"))
  }

  i <- i + 1L
  debates_tmp <- dictionary_scores[subset_value == y, ]

  complexity_year <- debates_tmp[, c("epobject_id", "sent")]

  # Calculate sentence complexity.
  # Name the texts explicitly so the returned `document` column can be
  # matched back to row positions without relying on quanteda's default
  # document-name prefix.
  sentence_texts <- complexity_year$sent
  names(sentence_texts) <- paste0("text", seq_along(sentence_texts))
  readability <- textstat_readability(sentence_texts,
                                      measure = "Flesch.Kincaid")

  # Start from a numeric NA column; rows that do not appear in the
  # readability output keep NA.
  complexity_year$complexity <- NA_real_
  scored_rows <- match(readability$document, names(sentence_texts))
  complexity_year$complexity[scored_rows] <- readability$Flesch.Kincaid

  complexity_year_out_list[[i]] <-
    complexity_year[, c("epobject_id", "complexity")]

}

# Stack the per-chunk tables into a single data.table. Rows keep their
# original order because chunks are consecutive row ranges.
complexity_scores <- rbindlist(complexity_year_out_list,
                               use.names = TRUE, fill = TRUE)

# NOTE(review): unlike the speech-level scores, these are saved without being
# multiplied by -1 -- confirm this difference is intended.
save(complexity_scores, file = "working/complexity_sentence.Rdata")
  
